// Type and swizzle aliases: vec2/vec4 map to the OpenCL float vector types and
// .rgb/.rgba map to the .xyz/.xyzw component selectors.
// NOTE(review): presumably here so GLSL-style shader code can be ported with
// minimal edits — confirm.
#define vec2 float2
#define vec4 float4
#define rgb xyz
#define rgba xyzw

// Sampler shared by every image read in this file: coordinates are normalized,
// out-of-range coordinates are clamped to the image edge, and texels are
// linearly filtered.
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;

// Reads one filtered texel from src_data.
//
// tc is first remapped as tc * (origROI[2], origROI[3]) + (origROI[0], origROI[1])
// and the result is used as the normalized coordinate for the file-level sampler.
// NOTE(review): origROI presumably holds the region of interest as
// (x, y, width, height) in normalized units — confirm against FilterParam.
vec4 INPUT(image2d_t src_data, __global FilterParam* param, vec2 tc)
{
	const vec2 roi_scale = (vec2)(param->origROI[2], param->origROI[3]);
	const vec2 roi_offset = (vec2)(param->origROI[0], param->origROI[1]);
	const vec2 remapped = tc * roi_scale + roi_offset;

	return read_imagef(src_data, sampler, remapped);
}

// Limits n to the closed interval [0, 1].
// Despite the name, the bounds are 0.0 and 1.0 rather than 0 and 255.
float clip_byte(float n)
{
	const float lower = 0.0f;
	const float upper = 1.0f;

	return clamp(n, lower, upper);
}

// Component-wise clamp of a three-channel value to [0, 1]
// (vector counterpart of clip_byte()).
float3 tune_clip3(float3 v)
{
	return clamp(v, 0.0f, 1.0f);
}

// Evaluates clip(clip(a - b) + b - c) + c per component.
// For operands already inside [0, 1] this is algebraically max(a, b, c);
// the explicit form is kept so float rounding matches the scalar formulation.
float3 tune_chain3(float3 a, float3 b, float3 c)
{
	return tune_clip3(tune_clip3(a - b) + b - c) + c;
}

// Samples the source image at pixel position (pos + offset), converted to a
// normalized coordinate by dividing by resolution.
vec4 tune_sample(image2d_t src_data, __global FilterParam* param, vec2 pos, vec2 offset, vec2 resolution)
{
	return INPUT(src_data, param, (pos + offset) / resolution);
}

// Tone-adjusts the source image and blends the result with the untouched source.
//
// Per work-item:
//   1. Read a 3x3 neighbourhood whose centre is the sample at (x + 1, y).
//   2. For each neighbour take the clipped positive difference to the centre
//      and combine the eight differences with tune_chain3() (row by row, then
//      across rows).
//   3. Use that combined difference to derive a per-channel lift factor z and
//      move the centre value towards 1.0 by z.
//   4. Blend the lifted colour with the source pixel using alpha / 100 as the
//      weight of the lifted colour; the source alpha channel is passed through.
//
// src_data  : image to read from.
// dest_data : image written at (get_global_id(0), get_global_id(1)).
// param     : filter parameters; used here for the ROI remap inside INPUT() and
//             by get_global_id0()/get_global_id1().
// alpha     : blend weight in percent (0 keeps the source, 100 uses the lifted
//             colour only). Values outside 0..100 are not rejected.
//
// The global work size is used as the image resolution, so the kernel is meant
// to be launched with one work-item per output pixel.
//
// NOTE(review): the neighbourhood is centred on x + 1 and sampled at integer
// pixel positions, while the pass-through pixel is sampled at (x + 0.5, y + 0.5).
// This asymmetry is preserved from the existing behaviour — confirm it is intended.
__kernel void MAIN(
      __read_only image2d_t src_data,
      __write_only image2d_t dest_data,
	  __global FilterParam* param,
	  int alpha)
{
	const int W = get_global_size(0);
	const int H = get_global_size(1);
	const float2 iResolution = (float2)(W, H);
	const int2 coordinate = (int2)(get_global_id(0), get_global_id(1));

	const vec2 fragCoord = (vec2)(get_global_id0( param), get_global_id1( param));

	// Centre of the neighbourhood; every difference below is relative to it.
	const float3 center = tune_sample(src_data, param, fragCoord, (vec2)(1.0f, 0.0f), iResolution).xyz;

	// Clipped positive differences between each neighbour and the centre.
	const float3 top0 = tune_clip3(tune_sample(src_data, param, fragCoord, (vec2)(0.0f, 1.0f), iResolution).xyz - center);
	const float3 top1 = tune_clip3(tune_sample(src_data, param, fragCoord, (vec2)(1.0f, 1.0f), iResolution).xyz - center);
	const float3 top2 = tune_clip3(tune_sample(src_data, param, fragCoord, (vec2)(2.0f, 1.0f), iResolution).xyz - center);

	const float3 mid0 = tune_clip3(tune_sample(src_data, param, fragCoord, (vec2)(0.0f, 0.0f), iResolution).xyz - center);
	const float3 mid2 = tune_clip3(tune_sample(src_data, param, fragCoord, (vec2)(2.0f, 0.0f), iResolution).xyz - center);

	const float3 bom0 = tune_clip3(tune_sample(src_data, param, fragCoord, (vec2)(0.0f, -1.0f), iResolution).xyz - center);
	const float3 bom1 = tune_clip3(tune_sample(src_data, param, fragCoord, (vec2)(1.0f, -1.0f), iResolution).xyz - center);
	const float3 bom2 = tune_clip3(tune_sample(src_data, param, fragCoord, (vec2)(2.0f, -1.0f), iResolution).xyz - center);

	// Combine within each row, then across the three rows. The middle row has
	// only two terms because the centre's difference to itself is zero.
	const float3 row_top = tune_chain3(top0, top1, top2);
	const float3 row_mid = tune_clip3(mid0 - mid2) + mid2;
	const float3 row_bom = tune_chain3(bom0, bom1, bom2);
	const float3 spread = tune_chain3(row_top, row_mid, row_bom);

	// Lift factor: computed on a 0..255 scale, shrinking as the local
	// difference (y1) grows relative to the centre brightness term (x1).
	const float3 y1 = spread * 255.0f;
	const float3 x1 = sqrt(center * 255.0f) + 1.0f;
	const float3 z = 1.4f * x1 / (2.0f * x1 + y1);
	const float3 tuneCol = center + (1.0f - center) * z;

	// Untouched source pixel, sampled at the pixel centre.
	const float4 inBGRA = tune_sample(src_data, param, fragCoord, (vec2)(0.5f, 0.5f), iResolution);

	const float4 outputCol = (float4)(inBGRA.xyz*(1.0f - (float)alpha/100.0f) + tuneCol*(float)alpha/100.0f, inBGRA.w);
	write_imagef(dest_data, coordinate, outputCol);
}
